Table of contents

2. Discrete subgroups

A pair of subgroups

Step 1 \(F_{st}=f\).

Step 2 Simulating the frequency \(P\) of \(m\) loci.

Step 3 Simulating diversed frequencies for two subpopulations \(Frq1=beta(\alpha=\frac{P(1-f)}{f}, \beta=\frac{(1-P)(1-f)}{f})\), \(Frq2=beta(\alpha=\frac{P(1-f)}{f}, \beta=\frac{(1-P)(1-f)}{f})\),

Under \(beta\) distribution, it will have mean of the two population \(\frac{\alpha}{\alpha+\beta}\), and variance \(\frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta)}\)

3. Admixted population

Admixture Dirichlet distribution

Dirichlet distribution wiki

\[D(\alpha_1, \alpha_2, ...\alpha_K)\] \(E(\alpha_i)=\frac{\alpha_i}{\sum_1^K\alpha_i}\), and \[var(\alpha_i)=\frac{\tilde{\alpha}_i(1-\tilde{\alpha}_i)}{\alpha_0+1}\] \[cov(\alpha_i, \alpha_j)=\frac{-\tilde{\alpha}_i\tilde{\alpha}_j}{\alpha_0+1}\]

in which \(\tilde{\alpha}_i=\frac{\alpha_i}{\sum_1^K\alpha_i}\) and \(\alpha_0=\sum_1^K\alpha_i\).

## Loading required package: coda
## Loading required package: MASS
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2020 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##

Six pops

M=c(10000)
fst1=0.05
fst2=0.01
for(m in 1:length(M)) {
  P=runif(M[m], 0.2, 0.8)
  Sz=c(100, 100, 100, 100, 100, 100)
  Fq=matrix(0, length(Sz), M[m])
  Fq[1,]=rbeta(M[m], P*(1-fst1)/fst1, (1-P)*(1-fst1)/fst1)

  P3=rbeta(M[m], P*(1-fst1)/fst1, (1-P)*(1-fst1)/fst1)

  P2=(0.5*Fq[1,]+0.5*P3)

  Fq[3,]=rbeta(M[m], P2*(1-fst2)/fst2, (1-P2)*(1-fst2)/fst2)
  Fq[4,]=rbeta(M[m], P2*(1-fst2)/fst2, (1-P2)*(1-fst2)/fst2)
  
  Fq[2,]=rbeta(M[m], P3*(1-fst2)/fst2, (1-P3)*(1-fst2)/fst2)
  Fq[5,]=rbeta(M[m], P3*(1-fst2)/fst2, (1-P3)*(1-fst2)/fst2)
  Fq[6,]=rbeta(M[m], P3*(1-fst2)/fst2, (1-P3)*(1-fst2)/fst2)

  G=matrix(0, sum(Sz), M[m])
  for(i in 1:Sz[1]) {
    G[i,] = rbinom(M[m], 2, Fq[1,])
  }
  
  for(i in (Sz[1]+1):(Sz[1]+Sz[2])) {
    G[i,] = rbinom(M[m], 2, Fq[2,])
  }
  
  for(i in (Sz[1]+Sz[2]+1):sum(Sz)) {
    G[i,] = rbinom(M[m], 2, Fq[3,])
  }

  for(i in (Sz[1]+Sz[2]+Sz[3]+1):sum(Sz)) {
    G[i,] = rbinom(M[m], 2, Fq[4,])
  }

  for(i in (Sz[1]+Sz[2]+Sz[3]+Sz[4]+1):sum(Sz)) {
    G[i,] = rbinom(M[m], 2, Fq[5,])
  }

  for(i in (Sz[1]+Sz[2]+Sz[3]+Sz[4]+Sz[5]+1):sum(Sz)) {
    G[i,] = rbinom(M[m], 2, Fq[6,])
  }

  Gs=apply(G, 2, scale)
  GG=Gs %*% t(Gs)
  EigenG=eigen(GG)
  layout(matrix(1:2, 1, 2))
  barplot(EigenG$values[1:20])
  plot(EigenG$vectors[,1], EigenG$vectors[,2])
  layout(matrix(1:6, 2, 3, byrow=T))
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[1:100,1], EigenG$vectors[1:100,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[101:200,1], EigenG$vectors[101:200,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[201:300,1], EigenG$vectors[201:300,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[301:400,1], EigenG$vectors[301:400,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[401:500,1], EigenG$vectors[401:500,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
  plot(xlim=range(EigenG$vectors[,1]), ylim=range(EigenG$vectors[,2]), main=paste(M[m], "markers"), EigenG$vectors[501:600,1], EigenG$vectors[501:600,2], bty='n', xlab="PC 1", ylab="PC 2", pch=16)
}

4 PCA methods

5 Homo and hetro

Heterogeneous cohort

rep=1
EV=matrix(0, rep, 10)
typeI=matrix(0, rep, 4)
fst=c(0.002, 0.01)
N1=c(100, 500, 1000)
N2=1000
Ml=c(8000, 2000)
M=sum(Ml)
ME=matrix(0, length(N1), 3)
for(s in 1:length(N1)) {
  for(r in 1:rep) {
    P=runif(M, 0.2, 0.8)
    p1=runif(Ml[1], 0.2, 0.8)
    p2=runif(Ml[2], 0.2, 0.8)
    P=c(p1, p2)
    P1=c(rbeta(Ml[1], p1*(1-fst[1])/fst[1], (1-p1)*(1-fst[1])/fst[1]),
         rbeta(Ml[2], p2*(1-fst[2])/fst[2], (1-p2)*(1-fst[2])/fst[2]))
    P2=c(rbeta(Ml[1], p1*(1-fst[1])/fst[1], (1-p1)*(1-fst[1])/fst[1]),
         rbeta(Ml[2], p2*(1-fst[2])/fst[2], (1-p2)*(1-fst[2])/fst[2]))

    Gn=matrix(0, nrow=N1[s]+N2, ncol=M)
    for(i in 1:N1[s]) {
      Gn[i,] = rbinom(M, 2, P1)
    }

    for(i in (N1[s]+1):(N2+N1[s])) {
      Gn[i,] = rbinom(M, 2, P2)
    }
    Frq1=apply(Gn[1:N1[s],], 2, mean)/2
    Frq2=apply(Gn[(N1[s]+1):(N1[s]+N2),], 2, mean)/2
    FrqM=apply(Gn, 2, mean)/2
    Fst=2*(N1[s]/(N1[s]+N2)*(Frq1-FrqM)^2 + N2/(N1[s]+N2) * (Frq2-FrqM)^2)/(FrqM*(1-FrqM))
    FstN=(Frq1-Frq2)^2/(2*FrqM*(1-FrqM))
    plot(Fst, FstN)
  }

  GnS=apply(Gn, 2, scale)
  G=GnS %*% t(GnS)/M
  EigenGN=eigen(G)
  
  RegB=matrix(0, M, 4)
  for(i in 1:M) {
    mod=lm(EigenGN$vectors[,1]~Gn[,i])
    RegB[i,1]=summary(mod)$coefficients[2,1]
    RegB[i,2]=summary(mod)$coefficients[2,2]
  }
  RegB[,3]=RegB[,1]^2/RegB[,2]^2
  qqplot(rchisq(M, 1), RegB[,3], bty='n', pch=16)
  abline(a=0, b=1, col="red")
  gc=median(RegB[,3])/0.455
  abline(a=0, b=gc, col="blue")
  
  ME[s,1]=mean(Fst)*(N1[s]+N2)
  ME[s,2]=gc
  ME[s,3]=EigenGN$values[1]
}
rownames(ME)=N1
barplot(t(ME), beside = T, border = F)
legend("topleft", legend = c("Fst", "GC", "Eigenvalue"), pch=15, col=c("black", "grey50", "grey"), bty='n')

6 Power analysis

Shiny link for power analysis

Related parameters:

\(p_1\) and \(p_2\) are the allele frequencies for the reference allele at two sub groups.

\(n_1\) and \(n_2\) are sample sizes for two subgroups, respectively. \(n=n_1+n_2\) the total sample size.

\(\omega_1=\frac{n_1}{n_1+n_2}\), and \(\omega_2=\frac{n_2}{n_1+n_2}\).

\(p=\omega_1p_1+\omega_2p_2\)

7 EigenGWAS pipeline

Manhattan plot function

manhattan <- function(dataframe, colors=c("gray10", "gray50"), ymax="max", limitchromosomes=NULL, suggestiveline=-log10(1e-5), genomewideline=NULL, title="", annotate=NULL, ...) {
  
  d=dataframe
  if (!("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d))) stop("Make sure your data frame contains columns CHR, BP, and P")
  if (!is.null(limitchromosomes)) {
    d=d[d$CHR %in% limitchromosomes, ]
  }
  
  d=subset(na.omit(d[order(d$CHR, d$BP), ]), (P>0 & P<=1)) # remove na's, sort, and keep only 0<P<=1
  d$logp = -log10(d$P)
  d$pos=NA
  ticks=NULL
  lastbase=0 #  colors <- rep(colors,max(d$CHR))[1:max(d$CHR)]
  colors <- rep(colors,max(d$CHR))[1:length(unique(d$CHR))]
  
  if (ymax=="max") ymax<-ceiling(max(d$logp))
  if (ymax<8) ymax<-8
  numchroms=length(unique(d$CHR))
  if (numchroms==1) {
    d$pos=d$BP
    ticks=floor(length(d$pos))/2+1
  } else {
    Uchr=unique(d$CHR)
    for (i in 1:length(Uchr)) {
      if (i==1) {
        d[d$CHR==Uchr[i], ]$pos=d[d$CHR==Uchr[i], ]$BP
      } else {
        lastbase=lastbase+tail(subset(d, CHR==Uchr[i-1])$BP, 1)
        d[d$CHR==Uchr[i], ]$pos=d[d$CHR==Uchr[i], ]$BP+lastbase
      }
      ticks=c(ticks, d[d$CHR==Uchr[i], ]$pos[floor(length(d[d$CHR==Uchr[i], ]$pos)/2)+1])
    }
  }
  if (numchroms==1) {
    with(d, plot(main=title, pos, logp, ylim=c(0,ymax), ylab=expression(-log[10](italic(p))), xlab=paste("Chromosome",unique(d$CHR),"position"), ...))
  } else {
    with(d, plot(main=title, pos, logp, ylim=c(0,ymax), ylab=expression(-log[10](italic(p))), xlab="Chromosome", xaxt="n", type="n", ...))
    axis(1, at=ticks, lab=unique(d$CHR), ...)
    icol=1
    Uchr=unique(d$CHR)
    for (i in 1:length(Uchr)) {
      with(d[d$CHR==Uchr[i], ], points(pos, logp, col=colors[icol], ...))
      icol=icol+1
    }
  }
  if (!is.null(annotate)) {
    d.annotate=d[which(d$SNP %in% annotate), ]
    with(d.annotate, points(pos, logp, col="green3", ...))
  }
  #  if (suggestiveline) abline(h=suggestiveline, col="blue")
  if (!is.null(genomewideline)) {
    abline(h=genomewideline, col="gray")
  } else {
    abline(h=-log10(0.05/nrow(d)), col="gray")    
  }
}

Random mating population

Replace the plink with your own path. Using HapMap CEU (northwest European) vs TSI (south Europeans) as example. The data can be found here.

plink2='/Users/gc5k/bin/plink_mac/plink' #replace it with your own path
dat="../data/euro/euro_10K"
layout(matrix(1:6, 2, 3))

#make-grm
grmCmd=paste(plink2, "--bfile ", dat, "--make-grm-gz --out ", dat)
system(grmCmd)

gz=gzfile(paste0(dat, ".grm.gz"))
grm=read.table(gz, as.is = T)
Ne=-1/mean(grm[grm[,1]!=grm[,2], 4])
Me=1/var(grm[grm[,1]!=grm[,2], 4])
print(paste("Ne=", format(Ne, digits = 2), "Me=", format(Me, digits = 2)))
## [1] "Ne= 199 Me= 18440"
hist(grm[grm[,1]!=grm[,2],4], main="Pairwise relatedness", xlab="Relatedness score", breaks = 50)

#pca
pcaCmd=paste(plink2, "--bfile ", dat, "--pca 10 --out ", dat)
system(pcaCmd)
barplot(main="Top 10 eigenvalue", read.table(paste0(dat, ".eigenval"), as.is = T)[,1], border = F)
abline(h=1, col="red", lty=2, lwd=3)

pc=read.table(paste0(dat, ".eigenvec"), as.is = T)
plot(pc[,3], pc[,4], xlab="Eigenvector 1", ylab="Eigenvector 2", bty="n", main="Eigenspace", bty="n", col=ifelse(pc[,3]>0, "red", "blue"), pch=16, cex=0.5)

#EigenGWAS
liCmd=paste0(plink2, " --linear --bfile ", dat, " --pheno ", dat, ".eigenvec --out ", dat)
system(liCmd)

#plot
EigenRes=read.table(paste0(dat, ".assoc.linear"), as.is = T, header = T)
EigenRes$Praw=EigenRes$P
gc=qchisq(median(EigenRes$P), 1, lower.tail = F)/qchisq(0.5, 1, lower.tail = F)
print(paste("GC = ", format(gc, digits = 4)))
## [1] "GC =  1.701"
EigenRes$P=pchisq(qchisq(EigenRes$Praw, 1, lower.tail = F)/gc, 1, lower.tail = F)
manhattan(EigenRes, title="EigenGWAS 1", pch=16, cex=0.3, bty='n')

#QQplot
chiseq=rchisq(nrow(EigenRes), 1)
qqplot(chiseq, qchisq(EigenRes$Praw, 1, lower.tail = F), xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), bty="n", col="grey", pch=16, cex=0.5)
points(sort(chiseq), sort(qchisq(EigenRes$P, 1, lower.tail = F)), col="black", pch=16, cex=0.5)
legend("topleft", legend = c("Raw", "GC correction"), pch=16, cex=0.5, col=c("grey", "black"), bty='n')
abline(a=0, b=1, col="red", lty=2)

Inbred population

Replace the plink with your own path. The Arabidopsis data can be found here.

plink2='/Users/gc5k/bin/plink_mac/plink' #replace it with your own path
dat="../data/arab/arab"

layout(matrix(1:6, 2, 3))
#make-grm
grmCmd=paste(plink2, "--bfile ", dat, "--make-grm-gz --out ", dat)
system(grmCmd)

gz=gzfile(paste0(dat, ".grm.gz"))
grm=read.table(gz, as.is = T)
Ne=-1/mean(grm[grm[,1]!=grm[,2], 4]/2)
Me=1/var(grm[grm[,1]!=grm[,2], 4]/2)
print(paste("Ne=", format(Ne, digits = 2), "Me=", format(Me, digits = 2)))
## [1] "Ne= 293 Me= 395"
hist(grm[grm[,1]!=grm[,2],4]/2, main="Pairwise relatedness", xlab="Relatedness score", breaks = 50)

#pca
pcaCmd=paste(plink2, "--bfile ", dat, "--pca 10 --out ", dat)
system(pcaCmd)
barplot(main="Top 10 eigenvalue", read.table(paste0(dat, ".eigenval"), as.is = T)[,1]/2, border = F)
abline(h=1, col="red", lty=2, lwd=3)

pc=read.table(paste0(dat, ".eigenvec"), as.is = T)
plot(pc[,3], pc[,4], xlab="Eigenvector 1", ylab="Eigenvector 2", bty="n", main="Eigenspace", bty="n", col=ifelse(pc[,3]>0, "red", "blue"), pch=16, cex=0.5)

#EigenGWAS
liCmd=paste0(plink2, " --linear --bfile ", dat, " --pheno ", dat, ".eigenvec --out ", dat)
system(liCmd)

#plot
EigenRes=read.table(paste0(dat, ".assoc.linear"), as.is = T, header = T)
EigenRes$Praw=EigenRes$P
gc=qchisq(median(EigenRes$P), 1, lower.tail = F)/qchisq(0.5, 1, lower.tail = F)
print(paste("GC = ", format(gc, digits = 4)))
## [1] "GC =  9.047"
EigenRes$P=pchisq(qchisq(EigenRes$Praw, 1, lower.tail = F)/gc, 1, lower.tail = F)
manhattan(EigenRes, title="EigenGWAS 1", pch=16, cex=0.3, bty='n')

#QQplot
chiseq=rchisq(nrow(EigenRes), 1)
qqplot(chiseq, qchisq(EigenRes$Praw, 1, lower.tail = F), xlab=expression(paste("Theoretical ", chi[1]^2)), ylab=expression(paste("Observed ", chi[1]^2)), bty="n", col="grey", pch=16, cex=0.5)
points(sort(chiseq), sort(qchisq(EigenRes$P, 1, lower.tail = F)), col="black", pch=16, cex=0.5)
legend("topleft", legend = c("Raw", "GC correction"), pch=16, cex=0.5, col=c("grey", "black"), bty='n')
abline(a=0, b=1, col="red", lty=2)